3. Pandas - ETL tools


In [2]:
# Explicit imports instead of the deprecated `%pylab` magic:
# `%pylab` performs `from pylab import *`, polluting the namespace and
# hiding where names come from. Everything below only needs `np.`-qualified
# NumPy, pandas, and `re`. If interactive plotting is needed later, add
# `%matplotlib inline` in its own cell.
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
import re


Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib

合併數據集

數據庫風格的DataFrame合併


In [3]:
# pandas 的 merge() 方法
df1 = DataFrame({'key': list('bbacaabd'), 
                 'data1': range(8)})
df2 = DataFrame({'key': list('abd'), 
                 'data2': range(3)})

In [4]:
df1


Out[4]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
7 7 d

In [5]:
df2


Out[5]:
data2 key
0 0 a
1 1 b
2 2 d

In [6]:
# 用 merge() 方法,預設以共同的column 'key' 來 join 兩個 DataFrame
pd.merge(df1, df2)


Out[6]:
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
6 7 d 2

In [7]:
# 也可以顯式的指定 要以哪一個欄位來 join
pd.merge(df1, df2, on = 'key')


Out[7]:
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
6 7 d 2

In [8]:
# 如果兩個物件的列名不同,也可以分別指定
df3 = DataFrame({'lkey': list('bbacaabd'), 
                 'data1': range(8)})
df4 = DataFrame({'rkey': list('abd'), 
                 'data2': range(3)})

In [9]:
df3


Out[9]:
data1 lkey
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
7 7 d

In [10]:
df4


Out[10]:
data2 rkey
0 0 a
1 1 b
2 2 d

In [11]:
# 分別指定各要以哪一個欄位來join
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey')
# lkey = 'c' 的項目不會出現,因為 merge()方法預設以 inner join 的模式來merge


Out[11]:
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
6 7 d 2 d

In [12]:
# 可以以 how 參數指定 join的模式 (outer 聯集)
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey', how = 'outer')


Out[12]:
data1 lkey data2 rkey
0 0 b 1.0 b
1 1 b 1.0 b
2 6 b 1.0 b
3 2 a 0.0 a
4 4 a 0.0 a
5 5 a 0.0 a
6 3 c NaN NaN
7 7 d 2.0 d

In [13]:
# 可以以 how 參數指定 join的模式 (inner)
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey', how = 'inner')


Out[13]:
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
6 7 d 2 d

In [14]:
# 可以以 how 參數指定 join的模式 (left)
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey', how = 'left')


Out[14]:
data1 lkey data2 rkey
0 0 b 1.0 b
1 1 b 1.0 b
2 2 a 0.0 a
3 3 c NaN NaN
4 4 a 0.0 a
5 5 a 0.0 a
6 6 b 1.0 b
7 7 d 2.0 d

In [15]:
# 可以以 how 參數指定 join的模式 (right)
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey', how = 'right')


Out[15]:
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
6 7 d 2 d

In [16]:
# 多對多的合併
df1 = DataFrame({'key': list('bbacab'), 
                'data1': range(6)})
df2 = DataFrame({'key': list('ababd'), 
                'data2': range(5)})

In [17]:
df1


Out[17]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 b

In [18]:
df2


Out[18]:
data2 key
0 0 a
1 1 b
2 2 a
3 3 b
4 4 d

In [19]:
# 因為 df2中 key a, b 都有對應多個值,所以會產生多個對應 rows
pd.merge(df1, df2, on = 'key', how = 'left')


Out[19]:
data1 key data2
0 0 b 1.0
1 0 b 3.0
2 1 b 1.0
3 1 b 3.0
4 2 a 0.0
5 2 a 2.0
6 3 c NaN
7 4 a 0.0
8 4 a 2.0
9 5 b 1.0
10 5 b 3.0

In [20]:
# 因為 df2中 key a, b 都有對應多個值,所以會產生多個對應 rows
pd.merge(df1, df2, on = 'key', how = 'inner')
# inner join 是求交集,所以不會有 NaN的值出現


Out[20]:
data1 key data2
0 0 b 1
1 0 b 3
2 1 b 1
3 1 b 3
4 5 b 1
5 5 b 3
6 2 a 0
7 2 a 2
8 4 a 0
9 4 a 2

In [21]:
# 可以根據多個 keys來 join
df1 = DataFrame({'key1': ['foo', 'foo', 'bar'], 
                 'key2': ['one', 'two', 'one'], 
                 'data': [1, 2, 3]})
df2 = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'], 
                 'key2': ['one', 'one', 'one', 'two'], 
                 'data': [4, 5, 6, 7]})

In [22]:
df1


Out[22]:
data key1 key2
0 1 foo one
1 2 foo two
2 3 bar one

In [23]:
df2


Out[23]:
data key1 key2
0 4 foo one
1 5 foo one
2 6 bar one
3 7 bar two

In [24]:
# 可以根據多個 keys來 join
pd.merge(df1, df2, on = ['key1', 'key2'], how = 'outer')


Out[24]:
data_x key1 key2 data_y
0 1.0 foo one 4.0
1 1.0 foo one 5.0
2 2.0 foo two NaN
3 3.0 bar one 6.0
4 NaN bar two 7.0

In [25]:
# 如果列名重複 ('key2')
pd.merge(df1, df2, on = ['key1'], how = 'outer')


Out[25]:
data_x key1 key2_x data_y key2_y
0 1 foo one 4 one
1 1 foo one 5 one
2 2 foo two 4 one
3 2 foo two 5 one
4 3 bar one 6 one
5 3 bar one 7 two

In [26]:
# 如果列名重複 ('key2')
# 使用 suffix 參數來指定附加到左右兩個 DataFrame重複列名的名稱上
pd.merge(df1, df2, on = ['key1'], how = 'outer', suffixes = ['_left', '_right'])


Out[26]:
data_left key1 key2_left data_right key2_right
0 1 foo one 4 one
1 1 foo one 5 one
2 2 foo two 4 one
3 2 foo two 5 one
4 3 bar one 6 one
5 3 bar one 7 two

In [27]:
# 設定 sort 參數, 依據 keys來排序
pd.merge(df1, df2, on = ['key1', 'key2'], how = 'outer', suffixes = ['_left', '_right'], sort = True)


Out[27]:
data_left key1 key2 data_right
0 3.0 bar one 6.0
1 NaN bar two 7.0
2 1.0 foo one 4.0
3 1.0 foo one 5.0
4 2.0 foo two NaN

索引上的合併


In [28]:
# 使用DataFrame的索引作為 join的 key
dfl = DataFrame({'key': list('abaabc'), 
                 'value': range(6)})
# dfr 以 ['a', 'b'] 作為索引
dfr = DataFrame({'value': [3.5, 7]}, index = list('ab'))

In [29]:
dfl


Out[29]:
key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5

In [30]:
# dfr 以 ['a', 'b'] 作為索引
dfr


Out[30]:
value
a 3.5
b 7.0

In [31]:
# 設定 right_index = True,表示 right DataFrame使用索引作為 join 的 key欄位
pd.merge(dfl, dfr, left_on = 'key', right_index = True, how = 'outer', suffixes = ['_left', '_right'], sort = True)


Out[31]:
key value_left value_right
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN

In [32]:
# 階層化索引
dfl = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 
                 'key2': [2000, 2001, 2002, 2001, 2002], 
                 'data': np.arange(5.)})
dfr = DataFrame(np.arange(12).reshape((6, 2)), 
                columns = ['data1', 'data2'],
                index = [['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                         [2001, 2000, 2000, 2000, 2001, 2002]])
dfr.index.names = ['state', 'year']

In [33]:
dfl


Out[33]:
data key1 key2
0 0.0 Ohio 2000
1 1.0 Ohio 2001
2 2.0 Ohio 2002
3 3.0 Nevada 2001
4 4.0 Nevada 2002

In [34]:
dfr


Out[34]:
data1 data2
state year
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11

In [35]:
# 階層化索引
# 左方指定用來join的 keys:  left_on = ['key1', 'key2']
# 右方指定使用索引來做為 join的 keys: right_index = True
pd.merge(dfl, dfr, left_on = ['key1', 'key2'], right_index = True, how = 'outer', suffixes = ['_left', '_right'], sort = True)


Out[35]:
data key1 key2 data1 data2
4 NaN Nevada 2000.0 2.0 3.0
3 3.0 Nevada 2001.0 0.0 1.0
4 4.0 Nevada 2002.0 NaN NaN
0 0.0 Ohio 2000.0 4.0 5.0
0 0.0 Ohio 2000.0 6.0 7.0
1 1.0 Ohio 2001.0 8.0 9.0
2 2.0 Ohio 2002.0 10.0 11.0

In [36]:
# 同時使用合併雙方的索引
dfl = DataFrame([[1., 2.], [3., 4.], [5., 6.]], 
                index = list('ace'), 
                columns = ['Ohio', 'Nevada'])
dfr = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13., 14]],
                index = list('bcde'), 
                columns = ['Missouri', 'Alabama'])

In [37]:
dfl


Out[37]:
Ohio Nevada
a 1.0 2.0
c 3.0 4.0
e 5.0 6.0

In [38]:
dfr


Out[38]:
Missouri Alabama
b 7.0 8.0
c 9.0 10.0
d 11.0 12.0
e 13.0 14.0

In [39]:
pd.merge(dfl, dfr, left_index = True, right_index = True, how = 'outer', suffixes = ['_left', '_right'], sort = True )


Out[39]:
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0

軸向連接(concatenation)


In [40]:
# NumPy中有 concatenate()方法
arr = np.arange(12).reshape((3, 4))
arr


Out[40]:
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [41]:
# NumPy 的 concatenate()方法
np.concatenate([arr, arr], axis = 1)


Out[41]:
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [42]:
# pandas 有 concat()方法
s1 = Series([0, 1], index = ['a', 'b'])
s2 = Series([2, 3, 4], index = ['c', 'd', 'e'])
s3 = Series([5, 6], index = ['f', 'g'])
sc = pd.concat([s1, s2, s3])
sc


Out[42]:
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [43]:
type(sc)


Out[43]:
pandas.core.series.Series

In [44]:
# concat()方法預設以 axis = 0 來連接,如果傳入 axis = 1,則會產生一個 DataFrame
sc = pd.concat([s1, s2, s3], axis = 1)
sc


Out[44]:
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0

In [45]:
type(sc)


Out[45]:
pandas.core.frame.DataFrame

In [46]:
# 傳入 join = 'inner' 可以看到交集
s4 = pd.concat([s1 * 5, s3])
s4


Out[46]:
a    0
b    5
f    5
g    6
dtype: int64

In [47]:
s1


Out[47]:
a    0
b    1
dtype: int64

In [48]:
pd.concat([s1, s4], axis = 1)


Out[48]:
0 1
a 0.0 0
b 1.0 5
f NaN 5
g NaN 6

In [49]:
# 傳入 join = 'inner' 可以看到交集
sc = pd.concat([s1, s4], axis = 1, join = 'inner')
sc


Out[49]:
0 1
a 0 0
b 1 5

In [50]:
# Select the row labels of the result explicitly.
# NOTE: the `join_axes` parameter was deprecated in pandas 0.25 and removed
# in 1.0 — reindex the concatenated result instead (identical output:
# rows 'c' and 'e' appear as all-NaN).
pd.concat([s1, s4], axis = 1).reindex(['a', 'c', 'b', 'e'])


Out[50]:
0 1
a 0.0 0.0
c NaN NaN
b 1.0 5.0
e NaN NaN

In [51]:
s3


Out[51]:
f    5
g    6
dtype: int64

In [52]:
# 使用 keys 參數,建立 階層式索引
result = pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'])
result


Out[52]:
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [53]:
# 把具有層次化索引的 Series, unstack 成為 DataFrame
result.unstack()


Out[53]:
a b f g
one 0.0 1.0 NaN NaN
two 0.0 1.0 NaN NaN
three NaN NaN 5.0 6.0

In [54]:
# 沿著 axis = 1 做 concat,keys就會成為 列頭
result = pd.concat([s1, s1, s3], axis = 1, keys = ['one', 'two', 'three'])
result


Out[54]:
one two three
a 0.0 0.0 NaN
b 1.0 1.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0

In [55]:
# 同樣的邏輯對 DataFrame也是一樣的
# 沿著 axis = 1 做 concat,keys就會成為 列頭
df1 = DataFrame(np.arange(6).reshape((3, 2)), 
                index = ['a', 'b', 'c'], 
                columns = ['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape((2, 2)), 
                index = ['a', 'c'], 
                columns = ['three', 'four'])
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'])


Out[55]:
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0

In [56]:
# 傳入一個字典,則字典的鍵就會被當作keys參數的值
# 這種表達方式比較容易讀懂
pd.concat({'level1': df1, 'level2': df2}, axis = 1)


Out[56]:
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0

In [57]:
# names 參數,設定層次化所引的名稱
pd.concat({'level1': df1, 'level2': df2}, axis = 1, names = ['upper', 'lower'])


Out[57]:
upper level1 level2
lower one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0

In [58]:
# 和當下分析工作無關的row索引
df1 = DataFrame(np.random.randn(3, 4), columns = list('abcd'))
df2 = DataFrame(np.random.randn(2, 3), columns = list('bda'))

In [59]:
df1


Out[59]:
a b c d
0 0.647506 1.221125 -0.566824 -1.791710
1 1.282575 0.190757 0.551807 -0.943677
2 1.798358 -0.444229 -0.329494 0.051920

In [60]:
df2


Out[60]:
b d a
0 1.244561 1.015036 -1.537533
1 -0.420767 0.815294 -0.301980

In [61]:
# concat之後,會保留原來的索引
pd.concat([df1, df2])


Out[61]:
a b c d
0 0.647506 1.221125 -0.566824 -1.791710
1 1.282575 0.190757 0.551807 -0.943677
2 1.798358 -0.444229 -0.329494 0.051920
0 -1.537533 1.244561 NaN 1.015036
1 -0.301980 -0.420767 NaN 0.815294

In [62]:
# ignore_index = True,不保留原本的索引
pd.concat([df1, df2], ignore_index = True)


Out[62]:
a b c d
0 0.647506 1.221125 -0.566824 -1.791710
1 1.282575 0.190757 0.551807 -0.943677
2 1.798358 -0.444229 -0.329494 0.051920
3 -1.537533 1.244561 NaN 1.015036
4 -0.301980 -0.420767 NaN 0.815294

合併重疊數據


In [63]:
# NumPy的 where 函數
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], 
           index = list('abcdef'))
b = Series(np.arange(len(a)), dtype = np.float64, 
           index = list('abcdef'))

In [64]:
a


Out[64]:
a    NaN
b    2.5
c    NaN
d    3.5
e    4.5
f    NaN
dtype: float64

In [65]:
b[-1] = np.nan

In [66]:
b


Out[66]:
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
f    NaN
dtype: float64

In [67]:
# NumPy的 where 函數,是一種向量化的 if-else
np.where(pd.isnull(a), b, a)


Out[67]:
array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])

In [68]:
b[:-2]


Out[68]:
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [69]:
a[2:]


Out[69]:
c    NaN
d    3.5
e    4.5
f    NaN
dtype: float64

In [70]:
# Series 的 combine_first()方法,也是一樣的功能,且會自動對齊數據
b[:-2].combine_first(a[2:])


Out[70]:
a    0.0
b    1.0
c    2.0
d    3.0
e    4.5
f    NaN
dtype: float64

In [71]:
# 對於DataFrame,combine_first的功能就像是在對缺失數據 打補釘
df1 = DataFrame({'a': [1., np.nan, 5., np.nan], 
                 'b': [np.nan, 2., np.nan, 5.], 
                 'c': list(range(2, 18, 4))})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.], 
                 'b': [np.nan, 3., 4., 6., 8.]})

In [72]:
df1


Out[72]:
a b c
0 1.0 NaN 2
1 NaN 2.0 6
2 5.0 NaN 10
3 NaN 5.0 14

In [73]:
df2


Out[73]:
a b
0 5.0 NaN
1 4.0 3.0
2 NaN 4.0
3 3.0 6.0
4 7.0 8.0

In [74]:
# 對於 df1中的缺失數據,會嘗試以df2中的對應數據補充
df1.combine_first(df2)


Out[74]:
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 5.0 14.0
4 7.0 8.0 NaN

重塑(reshape)和軸向旋轉(pivot)

重塑層次化索引


In [75]:
# 主要兩種方法
# stack: 將 column 旋轉為 row
# unstack: 將 row 旋轉為 column

In [76]:
df = DataFrame(np.arange(6).reshape((2, 3)), 
               index = pd.Index(['Ohio', 'Colorado'], name = 'state'), 
               columns = pd.Index(['one', 'two', 'three'], name = 'number'))
df


Out[76]:
number one two three
state
Ohio 0 1 2
Colorado 3 4 5

In [77]:
# stack: 將 column 旋轉為 row
s = df.stack()
s


Out[77]:
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [78]:
# s 是一個 Series 物件 
type(s)


Out[78]:
pandas.core.series.Series

In [79]:
# unstack: 將 row 旋轉為 column
# Series會變成一個 DataFrame
s.unstack()


Out[79]:
number one two three
state
Ohio 0 1 2
Colorado 3 4 5

In [80]:
# By default, stack/unstack operate on the innermost index level.
# Pass a level number (or level name) to operate on a different level.
s.unstack(0)


Out[80]:
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5

In [81]:
s.unstack('state')


Out[81]:
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5

In [82]:
# If not every level value appears in every group, unstacking
# may introduce missing data (NaN).
s1 = Series([0, 1, 2, 3], index = list('abcd'))
s2 = Series([4, 5, 6], index = list('cde'))
data2 = pd.concat([s1, s2], keys = ['one', 'two'])
data2


Out[82]:
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [83]:
# unstack操作可以會產生缺失數據
data2.unstack()


Out[83]:
a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0

In [84]:
# stack 預設會濾除缺失數據,因此 stack/unstack 是可逆的
data2.unstack().stack()


Out[84]:
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [85]:
# 也可以設定 dropna 參數,不要濾除缺失數據
data2.unstack().stack(dropna = False)


Out[85]:
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [86]:
# unstack操作中,旋轉軸的級別將會成為結果中的最低級別
df = DataFrame({'left': s, 'right': s + 5}, 
               columns = pd.Index(['left', 'right'], name = 'side'))
df


Out[86]:
side left right
state number
Ohio one 0 5
two 1 6
three 2 7
Colorado one 3 8
two 4 9
three 5 10

In [87]:
# 索引'state'經過unstack之後,成為最內層的 column索引
df.unstack('state')


Out[87]:
side left right
state Ohio Colorado Ohio Colorado
number
one 0 3 5 8
two 1 4 6 9
three 2 5 7 10

In [88]:
df.unstack('state').unstack('side')


Out[88]:
side   state     number
left   Ohio      one        0
                 two        1
                 three      2
       Colorado  one        3
                 two        4
                 three      5
right  Ohio      one        5
                 two        6
                 three      7
       Colorado  one        8
                 two        9
                 three     10
dtype: int32

In [89]:
df


Out[89]:
side left right
state number
Ohio one 0 5
two 1 6
three 2 7
Colorado one 3 8
two 4 9
three 5 10

In [90]:
df.unstack('number').unstack('state')


Out[90]:
side   number  state   
left   one     Ohio         0
               Colorado     3
       two     Ohio         1
               Colorado     4
       three   Ohio         2
               Colorado     5
right  one     Ohio         5
               Colorado     8
       two     Ohio         6
               Colorado     9
       three   Ohio         7
               Colorado    10
dtype: int32

將"長格式"旋轉為"寬格式"


In [91]:
# 重新設定 ldata_string,不用依靠檔案載入
ldata_string = """
{"date":{"0":"1959\\/3\\/31","1":"1959\\/3\\/31","2":"1959\\/3\\/31","3":"1959\\/6\\/30","4":"1959\\/6\\/30","5":"1959\\/6\\/30","6":"1959\\/9\\/30","7":"1959\\/9\\/30","8":"1959\\/9\\/30"},"item":{"0":"realgdp","1":"infl","2":"unemp","3":"realgdp","4":"infl","5":"unemp","6":"realgdp","7":"infl","8":"unemp"},"value":{"0":2710.349,"1":0.0,"2":5.8,"3":2712.349,"4":2.0,"5":7.8,"6":2714.349,"7":4.0,"8":9.8}}
"""

In [92]:
import json
df = DataFrame(json.loads(ldata_string))
df
# 長格式 
# 好處: 值的種類可以隨時增加或減少
# 缺點: 操作起來較麻煩,不易閱讀


Out[92]:
date item value
0 1959/3/31 realgdp 2710.349
1 1959/3/31 infl 0.000
2 1959/3/31 unemp 5.800
3 1959/6/30 realgdp 2712.349
4 1959/6/30 infl 2.000
5 1959/6/30 unemp 7.800
6 1959/9/30 realgdp 2714.349
7 1959/9/30 infl 4.000
8 1959/9/30 unemp 9.800

In [93]:
# pivot()方法 可以將 長格式 轉換為 寬格式
# 總共需要 index, columns, values 三個參數
pivoted = df.pivot(index = 'date', columns = 'item', values = 'value')
pivoted


Out[93]:
item infl realgdp unemp
date
1959/3/31 0.0 2710.349 5.8
1959/6/30 2.0 2712.349 7.8
1959/9/30 4.0 2714.349 9.8

In [94]:
# 增加一列 value2
df['value2'] = np.random.randn(len(df))
df


Out[94]:
date item value value2
0 1959/3/31 realgdp 2710.349 -0.710806
1 1959/3/31 infl 0.000 0.317895
2 1959/3/31 unemp 5.800 -0.442984
3 1959/6/30 realgdp 2712.349 0.879221
4 1959/6/30 infl 2.000 1.314049
5 1959/6/30 unemp 7.800 -0.333095
6 1959/9/30 realgdp 2714.349 0.409834
7 1959/9/30 infl 4.000 -1.387617
8 1959/9/30 unemp 9.800 1.489864

In [95]:
# 如果只指定 index, columns,則DataFrame就會具有層次化的columns
pivoted = df.pivot(index = 'date', columns = 'item')
pivoted


Out[95]:
value value2
item infl realgdp unemp infl realgdp unemp
date
1959/3/31 0.0 2710.349 5.8 0.317895 -0.710806 -0.442984
1959/6/30 2.0 2712.349 7.8 1.314049 0.879221 -0.333095
1959/9/30 4.0 2714.349 9.8 -1.387617 0.409834 1.489864

In [96]:
pivoted['value'][:3]


Out[96]:
item infl realgdp unemp
date
1959/3/31 0.0 2710.349 5.8
1959/6/30 2.0 2712.349 7.8
1959/9/30 4.0 2714.349 9.8

In [97]:
# 也可以用 set_index建立層次化的索引,然後再用 unstack建置
df.set_index(['date', 'item'])


Out[97]:
value value2
date item
1959/3/31 realgdp 2710.349 -0.710806
infl 0.000 0.317895
unemp 5.800 -0.442984
1959/6/30 realgdp 2712.349 0.879221
infl 2.000 1.314049
unemp 7.800 -0.333095
1959/9/30 realgdp 2714.349 0.409834
infl 4.000 -1.387617
unemp 9.800 1.489864

In [98]:
df.set_index(['date', 'item']).unstack('item')


Out[98]:
value value2
item infl realgdp unemp infl realgdp unemp
date
1959/3/31 0.0 2710.349 5.8 0.317895 -0.710806 -0.442984
1959/6/30 2.0 2712.349 7.8 1.314049 0.879221 -0.333095
1959/9/30 4.0 2714.349 9.8 -1.387617 0.409834 1.489864

數據轉換

移除重複數據


In [99]:
# Demo frame with duplicated rows: three 'one' rows and four 'two' rows.
data = DataFrame({'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data


Out[99]:
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4

In [100]:
# DataFrame 的 duplicated()方法傳回一個 boolean型態的 Series,表示各row是否重複
data.duplicated()


Out[100]:
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [101]:
# drop_duplicates()方法 傳回移除重複項目之後的結果
data.drop_duplicates()


Out[101]:
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4

In [102]:
data['k3'] = range(7)
data


Out[102]:
k1 k2 k3
0 one 1 0
1 one 1 1
2 one 2 2
3 two 3 3
4 two 3 4
5 two 4 5
6 two 4 6

In [103]:
# drop_duplicates()預設會對所有的columns來判斷是否有重複的 rows
data.duplicated()


Out[103]:
0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [104]:
# 也可以針對指定的columns來判斷是否有重複的 rows
data.duplicated(['k1'])


Out[104]:
0    False
1     True
2     True
3    False
4     True
5     True
6     True
dtype: bool

In [105]:
# duplicated, drop_duplicates 預設保留第一個出現的值組合
# 設定參數 keep = 'last',則會改為保留最後一個出現的值組合
data.duplicated(['k1'], keep = 'last')


Out[105]:
0     True
1     True
2    False
3     True
4     True
5     True
6    False
dtype: bool

利用函數或映射進行數據轉換


In [106]:
data = DataFrame({'food':['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'], 
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data


Out[106]:
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0

In [107]:
meat_to_animal = {'bacon': 'pig', 
                  'pulled pork': 'pig', 
                  'pastrami': 'cow', 
                  'corned beef': 'cow', 
                  'honey ham': 'pig', 
                  'nova lox': 'salmon'}

In [108]:
# Series.map() applies a dict or function element-wise — a convenient way
# to do element-level cleaning/transformation.
# Normalize case first with the vectorized `.str.lower()` accessor: unlike
# `.map(str.lower)` it propagates NaN instead of raising on missing values.
data['animal'] = data['food'].str.lower().map(meat_to_animal)
data


Out[108]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon

In [109]:
# 也可以透過 lambda來做
data['animal'] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data
# 使用 map()是實現元素級清理與轉換的便捷方式


Out[109]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon

替換值


In [110]:
data = Series([1., -999., 2., -999., -1000., 3.,])
data


Out[110]:
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [111]:
# 用 replace()方法來置換數值
data.replace(-999, np.nan)


Out[111]:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [112]:
# 一次置換多個值。要被替換的包裝在一個list中
data.replace([-999, -1000], np.nan)


Out[112]:
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [113]:
# 對不同值 進行不同的替換
# 要被替換的放在第一個 list, 替換者放在第二個 list,要匹配
data.replace([-999, -1000], [np.nan, 0])


Out[113]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [114]:
# 替代關係用 dict表達會比較清楚
data.replace({-999: np.nan, -1000: 0})


Out[114]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

重新命名軸索引

軸標籤也可以進行轉換,或者就地修改


In [115]:
data = DataFrame(np.arange(12).reshape((3, 4)), 
                 index = pd.Index(['Ohio', 'Colorado', 'New York'], name = 'state'), 
                 columns = pd.Index(['one', 'two', 'three', 'four'], name = 'quarter'))
data


Out[115]:
quarter one two three four
state
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11

In [116]:
# Index 也有一個 map()方法,可以傳回一個新的 Index物件
data.index = data.index.map(str.upper)
data


Out[116]:
quarter one two three four
OHIO 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11

In [117]:
# rename()方法會傳回一個數據集的轉換版本,而不是修改原來的數據
# 使用 index, columns 指定的函式 來修改軸標籤
data.rename(index = str.title, columns = str.upper)


Out[117]:
quarter ONE TWO THREE FOUR
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11

In [118]:
# rename 可以結合字典型物件,實現對部分軸標籤的更新
data.rename(index = {'OHIO': 'INDIANA'}, columns = {'three': 'peekaboo'})


Out[118]:
quarter one two peekaboo four
INDIANA 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11

In [119]:
data


Out[119]:
quarter one two three four
OHIO 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11

In [120]:
# 如果希望就地修改原有的數據集,使需要在 rename()方法中設定參數 inplace = True
data.rename(index = {'OHIO': 'INDIANA'}, columns = {'three': 'peekaboo'}, inplace = True)
data


Out[120]:
quarter one two peekaboo four
INDIANA 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11

檢測和過濾異常值(outlier)


In [121]:
# 常態分布陣列
np.random.seed(12345)
df = DataFrame(np.random.randn(1000, 4))
df.describe()


Out[121]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528

In [122]:
# 找出某列中,絕對值大於3的數值
col = df[2]
col[np.abs(col) > 3]


Out[122]:
5      3.248944
102    3.176873
324    3.260383
499   -3.056990
586   -3.184377
Name: 2, dtype: float64

In [123]:
# 找出所有 含有絕對值大於3的數值 的row,可以運用 any()
df[(np.abs(df) > 3).any(axis = 1)]


Out[123]:
0 1 2 3
5 -0.539741 0.476985 3.248944 -1.021228
97 -0.774363 0.552936 0.106061 3.927528
102 -0.655054 -0.565230 3.176873 0.959533
305 -2.315555 0.457246 -0.025907 -3.399312
324 0.050188 1.951312 3.260383 0.963301
400 0.146326 0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990 1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586 0.275144 1.179227 -3.184377 1.369891
808 -0.362528 -3.548824 1.553205 -2.186301
900 3.366626 -2.372214 0.851010 1.332846

In [124]:
# 將陣列數值限制在 +-3之間
gt3 = (np.abs(df) > 3)
df[gt3] = np.sign(df) * 3
df.describe()


Out[124]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067623 0.068473 0.025153 -0.002081
std 0.995485 0.990253 1.003977 0.989736
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.000000 2.653656 3.000000 3.000000

字符串操作

pandas中向量化的字串函數


In [125]:
data = Series({'Dave': 'dave@google.com', 
        'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 
        'Wes': np.nan})
data


Out[125]:
Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

In [126]:
# 透過 Series 的 str屬性 可以訪問一些字串的方法
data.str.contains('gmail')


Out[126]:
Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [127]:
# 是一個 StringMethods物件,之下掛了很多字串方法
data.str


Out[127]:
<pandas.core.strings.StringMethods at 0x89c52e8>

In [128]:
# .str 之下也掛有 reqular expression 的一些方法
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

# reg 的 findall()方法
data.str.findall(pattern, flags = re.IGNORECASE)


Out[128]:
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

In [129]:
# Per-element regex groups. NOTE: the FutureWarning recorded in this cell's
# output came true — `str.match` now returns a boolean Series, which would
# break the following `.str.get(1)` / `.str[0]` cells. Taking the first
# `findall` hit yields the same tuple-per-row result (NaN stays NaN).
matchs = data.str.findall(pattern, flags = re.IGNORECASE).str[0]
matchs


D:\Python\lib\site-packages\ipykernel\__main__.py:2: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
  from ipykernel import kernelapp as app
Out[129]:
Dave     (dave, google, com)
Rob        (rob, gmail, com)
Steve    (steve, gmail, com)
Wes                      NaN
dtype: object

In [130]:
# 提取匹配結果中 索引為 1 的元素
matchs.str.get(1)


Out[130]:
Dave     google
Rob       gmail
Steve     gmail
Wes         NaN
dtype: object

In [131]:
matchs.str[0]


Out[131]:
Dave      dave
Rob        rob
Steve    steve
Wes        NaN
dtype: object

In [132]:
# 對字串進行子串擷取
data.str[:5]


Out[132]:
Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object